The main page containing the enron data can be found at: https://www.cs.cmu.edu/~./enron/
In [178]:
import re
import numpy as np
import pandas as pd
import email
#Plotting stuff
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('poster')
import hypergraph
Read the data into hyperedges. We preserve order only insofar as the first element in each array is the sender. Email addresses may appear multiple times if they were included multiple times in the header. For example, there exist cases where a given address was included in both the cc and the bcc lines of the same message.
The to, cc and bcc addresses were treated the same and simply merged into a single list.
In [179]:
import os
# Build one hyperedge per message found in each user's `_sent_mail` folder.
# Every edge is a list whose first element is the From header, followed by the
# To, Cc and Bcc addresses in that order (repeats are kept).
root = '/Users/jchealy/Downloads/maildir/'
count = 0  # never incremented, so the old `count > 2` early exit could not fire; name kept defined
edges = []
file_index = []  # file_index[i] is the path of the message that produced edges[i]


def _split_addresses(header_value):
    """Split a comma-separated address header into a list of address strings."""
    # \s* on both sides of the comma also swallows the newlines/tabs of folded headers.
    return re.split(r'\s*,\s*', header_value)


for user in os.listdir(root):
    location = os.path.join(root, user, '_sent_mail')
    if not os.path.isdir(location):
        # This user has no _sent_mail folder.
        continue
    for fname in os.listdir(location):
        file = os.path.join(location, fname)
        with open(file) as f:
            message = email.message_from_file(f)
        edge = [message['from']]
        # Header lookup on an email Message is case-insensitive, so the
        # lowercase names match 'To', 'Cc' and 'Bcc' as well.
        for header in ('to', 'cc', 'bcc'):
            if header in message:
                edge.extend(_split_addresses(message[header]))
        edges.append(edge)
        file_index.append(file)
In [180]:
# Distribution of raw hyperedge lengths: how many messages (log10) have each
# list length, where the length counts the sender plus every listed address.
l = pd.Series(list(map(len, edges)))
length_counts = l.value_counts().sort_index()
d = pd.DataFrame(
    {'index': length_counts.index.values, 'freq': np.log10(length_counts.values)},
    columns=['index', 'freq'],
)
ax = d.plot(
    kind='scatter',
    x='index',
    y='freq',
    alpha=0.6,
    xlim=[-10, d['index'].max() + 10],  # pad both ends so extreme points are not clipped
)
ax.set_ylabel('Frequency (log10)')
ax.set_xlabel('Hyperedge Size')
Out[180]:
In [181]:
# Number of hyperedges collected, i.e. the length of the `edges` list.
len(edges)
Out[181]:
In [182]:
# Flatten every hyperedge into a single list of address occurrences, then
# reduce it to the sorted array of distinct addresses.
flat = [item for sublist in edges for item in sublist]
nodes = np.unique(flat)
# A cell only displays its final expression, so a bare `len(flat)` in the middle
# would be silently discarded; show both counts together instead:
# (total address occurrences, distinct addresses).
len(flat), len(nodes)
Out[182]:
In [183]:
# Peek at a few hyperedges; the slice starts at index 1, so the very first edge is not shown.
edges[1:10]
Out[183]:
In [184]:
# Create a Hypergraph instance from the imported `hypergraph` module; a later cell adds the entries of `edges` to it.
hg = hypergraph.Hypergraph()
In [185]:
# Register each address list as a hyperedge keyed by its position in `edges`,
# echoing the position every 10000 edges as a progress indicator.
for idx, members in enumerate(edges):
    hg.add_edge(idx, members)
    if not idx % 10000:
        print(idx)
In [186]:
# Same distribution as for the raw lists, but using the `size` attribute that
# the hypergraph reports for each stored edge.
edge_size = pd.Series([hg.edge[key].size for key in hg.edge])
size_counts = edge_size.value_counts().sort_index()
d = pd.DataFrame(
    {'index': size_counts.index.values, 'freq': np.log10(size_counts.values)},
    columns=['index', 'freq'],
)
ax = d.plot(
    kind='scatter',
    x='index',
    y='freq',
    alpha=0.6,
    xlim=[-10, d['index'].max() + 10],
)
ax.set_ylabel('Frequency (log10)')
ax.set_xlabel('Hyperedge Size')
Out[186]:
Now let's look at node size, which some might refer to as degree.
In [187]:
# Node-size distribution on log-log axes: both the node size and the number of
# nodes having that size are converted to log10 before plotting.
node_size = pd.Series([hg.node[key].size for key in hg.node])
node_size_counts = node_size.value_counts().sort_index()
d = pd.DataFrame(
    {
        'index': np.log10(node_size_counts.index.values),
        'freq': np.log10(node_size_counts.values),
    },
    columns=['index', 'freq'],
)
ax = d.plot(
    kind='scatter',
    x='index',
    y='freq',
    alpha=0.6,
    xlim=[-1, d['index'].max() + 1],
)
ax.set_ylabel('Frequency (log10)')
ax.set_xlabel('Vertex Size (log10)')
Out[187]:
In [188]:
# First label of every hyperedge; the edges were built with the From header in
# position 0, so this is the sender of each message.
first_labels = [hg.edge[edge_id].labels[0] for edge_id in hg.edge]
sender = pd.Series(first_labels)
sender.head()
Out[188]:
In [189]:
# Number of hyperedges per sender, drawn as a line over senders ordered by
# decreasing count (value_counts sorts that way) with a logarithmic y axis.
sender = pd.Series([hg.edge[edge_id].labels[0] for edge_id in hg.edge])
emails_per_sender = sender.value_counts()
d = emails_per_sender.plot(
    kind='line',
    logx=False,
    logy=True,
    title='number of email sent by each user',
)
# `d` is the Axes returned by pandas; drop the x ticks (one per sender would be unreadable).
d.set_xticks([])
d.set_ylabel('outgoing hyperedges')
d.set_xlabel('sender (decreasing order)')
Out[189]:
It might be worth inducing the 2-section and overlaying the two degree distributions on each other.
In [194]:
# Show floats with 2 decimal places. The full option key is required: the bare
# 'precision' pattern is ambiguous on newer pandas (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 2)
In [203]:
# Compare each edge's `size` with its `cardinality` and report how many edges
# disagree, both as a count and as a percentage of all edges.
edge_size = pd.Series([hg.edge[key].size for key in hg.edge])
edge_card = pd.Series([hg.edge[key].cardinality for key in hg.edge])
# Positions (in hg.edge iteration order) at which the two measures differ.
edge_difference_index = np.where(edge_size != edge_card)[0]
n_differing = len(edge_difference_index)
n_edges = len(edge_size)
print("{} out of {} ({:.2f}%) edges differed between cardinality and size".format(
    n_differing, n_edges, n_differing / n_edges * 100))
In [207]:
# Labels of the first five edges whose size and cardinality differ. Slice the
# index array before the lookup so only five edges are fetched instead of
# building the full list and discarding the rest.
[hg.edge[x].labels for x in edge_difference_index[:5]]
Out[207]:
In [49]:
# Assign every node key a consecutive integer id, in the iteration order of hg.nodes.
# NOTE(review): other cells access `hg.node` (singular); confirm that `hg.nodes`
# is also a valid attribute of the Hypergraph class.
keys = hg.nodes
values = range(len(hg.nodes))
vertex_map = dict(zip(keys, values))
filename = "enronNumericHypergraphEdgelist.txt"


@np.vectorize
def vertex_mapper(x):
    """Return the integer id that `vertex_map` assigns to node key `x`."""
    return vertex_map[x]


# Write one line per hyperedge: the space-separated integer ids of its labels.
# The context manager closes the file even if a lookup or write raises, which
# the previous explicit open()/close() pair did not guarantee.
with open(filename, 'w') as file:
    for e in hg.edge:
        tup = vertex_mapper(hg.edge[e].labels)
        file.write(" ".join(str(elem) for elem in tup) + "\n")
In [33]:
# Inspect the `labels` attribute of the node stored under this address
# (presumably the edges the address takes part in — TODO confirm against the hypergraph module).
hg.node['tim.belden@enron.com'].labels
Out[33]:
In [ ]: